Libs e utils

library(tidyverse)
library(factoextra)
library(lubridate)
library(caret)
library(leaflet)
library(viridisLite)
library(gridExtra)
library(ggpubr)
library(corrplot)
library(ROCR)

getmode <- function(v) {
   uniqv <- unique(v[which(!is.na(v))])
   uniqv[which.max(tabulate(match(v, uniqv)))]
}

elbow <- function(data,k.max){
wss <- sapply(1:k.max, 
              function(k){kmeans(data, k, nstart=50,iter.max = 15 )$tot.withinss})
print(plot(1:k.max, wss,
     type="b", pch = 19, frame = FALSE, 
     xlab="Number of clusters K",
     ylab="Total within-clusters sum of squares"))

}

rocp = function(mod,test,y_test){

pred = predict(mod,test,"prob")[,2]    
p = prediction(pred,y_test)
perf = performance(p,measure = "tpr", x.measure = "fpr")


print(plot(perf,main = "ROC Curve"))

auc.perf = performance(p, measure = "auc")
return (auc.perf@y.values)
}

plot_density_pred = function(mod,test){
  pred = predict(mod,test,"prob")
  pred = cbind(pred,raintomorrow=test$raintomorrow)
  names(pred)[1:2] = c("No","Yes")
  print(ggplot(pred,aes(Yes,col=raintomorrow)) + geom_density() + theme_bw() + ggtitle("Distribuição das probabilidades preditas de Yes"))
  
}


Leitura dos dados

setwd("./processo_cientista-master/data")
rain_data <- read_csv("rain_data_aus.csv", 
                        col_types = cols(evaporation = col_double(), 
                        sunshine = col_double()))

for(i in 1:8){
        file = paste0("wind_table_0",i,".csv")
        if(i==1){
                wind_data = read_csv(file)
                cols = names(wind_data)
        }
        else{
                aux = read_csv(file)
                names(aux) = cols
                wind_data = rbind(wind_data,aux)
        }
}

coords = read_csv("city_coord.csv")

Introdução

O problema consiste em prever se irá ou não chover no dia seguinte. Os dados estão divididos em duas bases diferentes uma delas lida com informações metereologicas mais gerais e a outra com informações específicas sobre os ventos, inicialmente elas serão tratadas individualmente, mas eventualmente serão unidas.

Dados Chuvas


Para a base de dados sobre a chuva, ela possui 142193 observações e 23 variáveis que são:

##  [1] "date"             "location"         "mintemp"          "maxtemp"          "rainfall"         "evaporation"     
##  [7] "sunshine"         "humidity9am"      "humidity3pm"      "pressure9am"      "pressure3pm"      "cloud9am"        
## [13] "cloud3pm"         "temp9am"          "temp3pm"          "raintoday"        "amountOfRain"     "raintomorrow"    
## [19] "temp"             "humidity"         "precipitation3pm" "precipitation9am" "modelo_vigente"

A variável resposta é raintomorrow.

Inicialmente convertendo as colunas do tipo character para factor e validando os demais formatos, também removendo a coluna com as probabilidades do modelo vigente.

rain_data = rain_data %>% mutate_if(is.character, as.factor)
rain_data = rain_data %>% select(-modelo_vigente)
summary(rain_data)
##       date                location         mintemp         maxtemp         rainfall       evaporation        sunshine    
##  Min.   :2007-11-01   Canberra:  3418   Min.   :-8.50   Min.   :-4.80   Min.   :  0.00   Min.   :  0.00   Min.   : 0.00  
##  1st Qu.:2011-01-06   Sydney  :  3337   1st Qu.: 7.60   1st Qu.:17.90   1st Qu.:  0.00   1st Qu.:  2.60   1st Qu.: 4.90  
##  Median :2013-05-27   Perth   :  3193   Median :12.00   Median :22.60   Median :  0.00   Median :  4.80   Median : 8.50  
##  Mean   :2013-04-01   Darwin  :  3192   Mean   :12.19   Mean   :23.23   Mean   :  2.35   Mean   :  5.47   Mean   : 7.62  
##  3rd Qu.:2015-06-12   Hobart  :  3188   3rd Qu.:16.80   3rd Qu.:28.20   3rd Qu.:  0.80   3rd Qu.:  7.40   3rd Qu.:10.60  
##  Max.   :2017-06-25   Brisbane:  3161   Max.   :33.90   Max.   :48.10   Max.   :371.00   Max.   :145.00   Max.   :14.50  
##                       (Other) :122704   NA's   :637     NA's   :322     NA's   :1406     NA's   :60843    NA's   :67816  
##   humidity9am      humidity3pm      pressure9am      pressure3pm        cloud9am        cloud3pm        temp9am     
##  Min.   :  0.00   Min.   :  0.00   Min.   : 980.5   Min.   : 977.1   Min.   :0.00    Min.   :0.0     Min.   :-7.20  
##  1st Qu.: 57.00   1st Qu.: 37.00   1st Qu.:1012.9   1st Qu.:1010.4   1st Qu.:1.00    1st Qu.:2.0     1st Qu.:12.30  
##  Median : 70.00   Median : 52.00   Median :1017.6   Median :1015.2   Median :5.00    Median :5.0     Median :16.70  
##  Mean   : 68.84   Mean   : 51.48   Mean   :1017.7   Mean   :1015.3   Mean   :4.44    Mean   :4.5     Mean   :16.99  
##  3rd Qu.: 83.00   3rd Qu.: 66.00   3rd Qu.:1022.4   3rd Qu.:1020.0   3rd Qu.:7.00    3rd Qu.:7.0     3rd Qu.:21.60  
##  Max.   :100.00   Max.   :100.00   Max.   :1041.0   Max.   :1039.6   Max.   :9.00    Max.   :9.0     Max.   :40.20  
##  NA's   :1774     NA's   :3610     NA's   :14014    NA's   :13981    NA's   :53657   NA's   :57094   NA's   :904    
##     temp3pm      raintoday      amountOfRain     raintomorrow      temp          humidity      precipitation3pm precipitation9am
##  Min.   :-5.40   No  :109332   Min.   :  0.000   No :110316   Min.   :-3.76   Min.   :  2.00   Min.   : 0.00    Min.   :-17.74  
##  1st Qu.:16.60   Yes : 31455   1st Qu.:  0.000   Yes: 31877   1st Qu.:22.52   1st Qu.: 44.00   1st Qu.: 8.00    1st Qu.:  6.65  
##  Median :21.10   NA's:  1406   Median :  0.000                Median :28.52   Median : 63.20   Median :10.00    Median : 10.00  
##  Mean   :21.69                 Mean   :  2.361                Mean   :28.51   Mean   : 61.99   Mean   :10.01    Mean   : 10.00  
##  3rd Qu.:26.40                 3rd Qu.:  0.800                3rd Qu.:35.48   3rd Qu.: 80.00   3rd Qu.:12.00    3rd Qu.: 13.39  
##  Max.   :46.70                 Max.   :371.000                Max.   :59.72   Max.   :122.00   Max.   :26.00    Max.   : 32.48  
##  NA's   :2726                                                 NA's   :322     NA's   :3610

Pode-se notar um grande número de valores faltantes.

Aproximadamente 9.08% dos valores da tabela são faltantes, isso será discutido mais a frente.

Dados Ventos

Já a outra base de dados, que lida com informações específicas sobre o vento numa localidade específica e uma data específica, possui 164386 observações e 8 variáveis que são:

## [1] "date"           "location"       "wind_gustdir"   "wind_gustspeed" "wind_dir9am"    "wind_dir3pm"    "wind_speed9am" 
## [8] "wind_speed3pm"


Corrigindo os campos character para factor e validando os demais tipos

wind_data = wind_data %>% mutate_if(is.character, as.factor)

summary(wind_data)
##       date                location       wind_gustdir    wind_gustspeed    wind_dir9am     wind_dir3pm     wind_speed9am   
##  Min.   :2007-11-01   Canberra:  3877   W      : 11451   Min.   :  6.00   N      :13294   SE     : 12019   Min.   :  0.00  
##  1st Qu.:2011-06-07   Sydney  :  3798   SE     : 10654   1st Qu.: 31.00   SE     :10503   W      : 11523   1st Qu.:  7.00  
##  Median :2014-01-13   Perth   :  3655   N      : 10513   Median : 39.00   E      :10397   S      : 10971   Median : 13.00  
##  Mean   :2013-09-25   Darwin  :  3653   E      : 10446   Mean   : 39.86   SSE    :10289   WSW    : 10729   Mean   : 13.96  
##  3rd Qu.:2016-04-24   Hobart  :  3650   WSW    : 10320   3rd Qu.: 48.00   NW     :10039   SW     : 10630   3rd Qu.: 19.00  
##  Max.   :2017-06-25   Brisbane:  3618   (Other):100424   Max.   :135.00   (Other):98436   (Other):103679   Max.   :130.00  
##                       (Other) :142135   NA's   : 10578   NA's   :10518    NA's   :11428   NA's   :  4835   NA's   :1453    
##  wind_speed3pm  
##  Min.   : 0.00  
##  1st Qu.:13.00  
##  Median :19.00  
##  Mean   :18.57  
##  3rd Qu.:24.00  
##  Max.   :87.00  
##  NA's   :3583


Já para esses dados os valores faltantes correspondem à 3.22% das observações, valor menor que da base de chuvas.

Análise Exploratória

Dados Chuvas

Dentro dos dados de chuva existem 18 colunas numéricas, 3 categóricas e uma de data, que serão avaliadas a seguir.

Variáveis Categóricas

raintomorrow

Essa é a variável resposta do problema, como é possível notar ela está distribuida de maneira bastante desbalanceada entre os seus possíveis resultados, o que demanda algumas mudanças na hora do desenvolvimento do modelo.

ggplot(rain_data,aes(raintomorrow)) + geom_bar() + theme_bw() + ylab("Contagem")

raintoday

Essa varíavel discrimina se no dia de hoje teve-se algum relato de chuva ou não

p1 = ggplot(rain_data,aes(raintoday)) + geom_bar()+ theme_bw() + ylab("Contagem")
p2=ggplot(rain_data,aes(raintoday, fill = raintomorrow)) + geom_bar(position = "dodge") + theme_bw() + ylab("Contagem")

grid.arrange(p1,p2,ncol=1)


É possível notar que nos dias que choveu a proporção de vezes que choveu no dia seguinte é muito maior.

prop.table(table(Choveu.Hoje = rain_data$raintoday, Chove.Amanha = rain_data$raintomorrow),margin = 1)
##            Chove.Amanha
## Choveu.Hoje        No       Yes
##         No  0.8481323 0.1518677
##         Yes 0.5359402 0.4640598

location

Localidade da medição, quase todas elas possuem a mesma frequência nos dados com exceção de Hobart, Nhil e Tuggeranong que distoam mais.

p1 = ggplot(rain_data,aes(location)) + geom_bar() + theme_bw() + ylab("Contagem") + theme(axis.text.x = element_text(angle = 90))

p2 = rain_data %>% group_by(location,raintomorrow) %>% summarise(total = n()) %>% mutate(Proporcao=total/sum(total)) %>% 
ggplot(aes(x=location,y=Proporcao, fill = raintomorrow)) + geom_bar(position = "dodge",stat="identity") + theme_bw() + ylab("Proporção") + theme(axis.text.x = element_text(angle = 90))

grid.arrange(p1,p2)

É notavel também que a proporção vezes que choveu no dia seguinte varia bastante entre as localidades, discriminando bem a variável resposta.

O Volume médio de chuva possui uma variação considerável levando em conta as localidades.

rain_data %>% filter(raintoday == "Yes") %>% group_by(location) %>% summarise(Media_Volume_Chuva = mean(amountOfRain)) %>% 
  ggplot(aes(location,Media_Volume_Chuva)) + geom_bar(stat="Identity") + theme_bw() + ggtitle("Média do volume de chuvas por localidade") + theme(axis.text.x = element_text(angle = 90))

Variáveis Numéricas

amountOfRain

Variável que mede a quantidade de chuva, entretanto sem informação se é sobre o dia atual ou o próximo.

ggplot(rain_data,aes(amountOfRain,col=raintomorrow)) + geom_density() + theme_bw()

aux = rain_data %>% select(amountOfRain,raintomorrow) %>% mutate(Zero_chuva = ifelse(amountOfRain==0,1,0),
                                                                 raintomorrow = ifelse(raintomorrow=='Yes',1,0))

table(Zero_Chuva = aux$Zero_chuva, Chove_Amanha = aux$raintomorrow)
##           Chove_Amanha
## Zero_Chuva     0     1
##          0 19239 31877
##          1 91077     0

Ou seja para os dias com amountOfRain = 0, 100% das vezes não choverá amanhã. Variável nao será utilizada para a modelagem pois muito provavelmente se refere à algum evento futuro.

Variáveis com muitos outliers: rainfall, evaporation, amountofrain.

As outras variáveis mostram um comportamento mais controlado, tendo a mediana como centro dos dados.

rain_data = rain_data %>% mutate(Dia = as.factor(day(date)), 
                                 Mes = as.factor(month(date)),
                                 Ano = as.factor(year(date)))


plots = list()
plots_resp = list()
i=1
for (col in names_numerical_columns) {
  p = ggplot(rain_data,aes_string(x="1",y=col))+geom_boxplot() +theme_bw() + ggtitle(col)
  p2 = ggplot(rain_data,aes_string(col,col="raintomorrow")) + geom_density() + theme_bw() + ggtitle(col)
  plots[[i]] = p
  plots_resp[[i]] = p2
  i = i+1
}

x = length(plots)

cols = round(sqrt(x),0)
rows = ceiling(x/cols)

ggarrange(plotlist = plots, ncol=cols, nrow = rows)

Comportamento das variáveis numéricas realizando a separação pela variável resposta

ggarrange(plotlist = plots_resp, ncol=cols, nrow = rows)

Individualmente nenhuma das variáveis possuiria um bom desempenho para dizer se chove ou não amanhã, entretanto algumas variáveis acabam chamando atenção como sunshine, nos dias onde essa variável mostra valores mais altos existe uma tendência à não chover no dia seguinte ou humidity3pm onde valores mais baixos mostram também a mesma tendência, entre outras.

corr = cor(rain_data[,names_numerical_columns], use = "complete.obs")

corrplot(corr,'number')

Pode-se notar algumas correlações altas esperadas como temperatura minima e a temperatura da as 9 da manhã e a quantidade de horas de sol com a quantidade de nuvens no céu, outros pontos são:

  • umidade e a umidade medida as 3 da tarde mostram uma forte correlação também, indicando que a médida de umidade normalmente foi feita na parte da tarde, o mesmo para a temperatura.

  • umidade 9 da manhã e das 3 tarde possuem média correlação indicando que essa medida não se altera muito ao longo do dia.

  • umidade e a quantidade de horas de sol possuem uma correlação média.

  • a variável que mede evaporação com a temperatura possuem uma correlação média


Variável Data

rain_data = rain_data %>% mutate(Ano = as.factor(year(date)),
                                 Mes = as.factor(month(date)),
                                 Dia = as.factor(day(date)),
                                 Week = as.factor(week(date)))

p1 = rain_data %>% group_by(Ano) %>% summarise(Qtd_Chuva = sum(rainfall,na.rm = T)) %>% 
                ggplot(aes(x=Ano,y=Qtd_Chuva,group=1)) + geom_line() + theme_bw() + theme(axis.text.x = element_text(angle = 90)) 


p2 = rain_data %>% group_by(Ano,Mes) %>% arrange(Ano,Mes) %>% summarise(Qtd_Chuva = sum(rainfall,na.rm = T)) %>% 
              ggplot(aes(x=Mes,y=Qtd_Chuva,group=factor(Ano),col=factor(Ano))) + geom_line() + theme_bw() + theme(axis.text.x = element_text(angle = 90)) 

p3 = rain_data %>% group_by(Ano,Week) %>% arrange(Ano,Week) %>% summarise(Qtd_Chuva = sum(rainfall,na.rm = T)) %>% 
              ggplot(aes(x=Week,y=Qtd_Chuva,group=factor(Ano),col=factor(Ano))) + geom_line() + theme_bw() + theme(axis.text.x = element_text(angle = 90))

grid.arrange(p1,p2,p3,ncol=1)

Aparentemente não existe nenhum tipo de padrão ao longo dos anos nos dados.

rain_data %>% group_by(Ano,Mes) %>% count() %>% mutate (AnoMes = factor(paste0(Ano,'-',Mes))) %>% 
                ggplot(aes(x=AnoMes,y=n)) + geom_bar(stat="identity") + theme_bw() + theme(axis.text.x = element_text(angle = 90)) 

Com exceção do período 2007/11 - 2008/09 as quantidades das medidas são muito próximas entre os meses

Existe diferença na proporção das vezes que choveu em cada um dos meses

rain_data %>% group_by(Mes) %>% mutate(choveu = ifelse(raintoday=='Yes',1,0)) %>% summarise(prop_1 = sum(choveu, na.rm = T), freq = n()) %>% mutate (prop = prop_1/freq) %>% ggplot(aes(x=Mes,y=prop)) + geom_bar(stat="identity") + theme_bw() + theme(axis.text.x = element_text(angle = 90)) 

prop.table(table(rain_data$Mes,rain_data$raintomorrow),margin=1)
##     
##             No       Yes
##   1  0.8066713 0.1933287
##   2  0.7903088 0.2096912
##   3  0.7866677 0.2133323
##   4  0.7821511 0.2178489
##   5  0.7747223 0.2252777
##   6  0.7381548 0.2618452
##   7  0.7307921 0.2692079
##   8  0.7474919 0.2525081
##   9  0.7702953 0.2297047
##   10 0.8043036 0.1956964
##   11 0.7874531 0.2125469
##   12 0.7918594 0.2081406

Teste chi quadrado para verificar se os grupos são estatisticamente diferentes, obtendo um p-valor de quase zero, confirmando a hipótese de efeito da variável mês.

chisq.test(rain_data$Mes,rain_data$raintomorrow)
## 
##  Pearson's Chi-squared test
## 
## data:  rain_data$Mes and rain_data$raintomorrow
## X-squared = 469.53, df = 11, p-value < 2.2e-16

Dados Ventos

Dentro dos dados de chuva existem 3 colunas numéricas, 4 categóricas e uma de data, que serão avaliadas a seguir.

Variáveis categoricas

Dentre as variáveis categóricas da base de ventos temos a que fala sobre a localidade da medição as outras três trazem informações sobre a direção que o vento vem.

location

A varíavel sobre a localização da medição, quase todas as localidade tiveram a praticamente mesma quantidade de medições com exceção de Hobart, Nhil e Uluru.

wind_data %>% group_by(location) %>% count() %>% ggplot(aes(location,n)) + geom_bar(stat='identity') + theme_bw() + theme(axis.text.x = element_text(angle = 90))

Variáveis de direção do vento

Existem 3 varíaveis que dizem a região que o vento está vindo:

  • wind_gustdir: a direção do vento mais forte daquele dia
  • wind_dir9am: a direção do vento as 9 da manhã
  • wind_dir3pm: a direção do vento as 3 da tarde

É possivel notar que existe diferença entre as proporções de dias que tiveram ou não chuva no dia seguinte dentre as direções dos ventos, entretanto os gráficos para as 3 variáveis são bastante parecidos. Entretanto a interação da variável de direção com a sua velocidade respectiva, possa ser importante no problema.

Adicionando a variável resposta do problema para a construção de algumas visualizações

wind_data = wind_data %>% left_join(rain_data %>% select(date,location,raintomorrow))

wind_data %>% group_by(wind_gustdir,raintomorrow) %>% summarise(n = n()) %>% mutate(Proporcao = n/sum(n)) %>%  ggplot(aes(x=wind_gustdir,y=Proporcao,fill=raintomorrow)) + geom_bar(position = "dodge",stat='identity') + theme_bw() + ggtitle("wind_gustdir")

wind_data %>% group_by(wind_dir9am,raintomorrow) %>% summarise(n = n()) %>% mutate(Proporcao = n/sum(n)) %>%  ggplot(aes(x=wind_dir9am,y=Proporcao,fill=raintomorrow)) + geom_bar(position = "dodge",stat='identity') + theme_bw() + ggtitle("wind_dir9am")

wind_data %>% group_by(wind_dir3pm,raintomorrow) %>% summarise(n = n()) %>% mutate(Proporcao = n/sum(n)) %>%  ggplot(aes(x=wind_dir3pm,y=Proporcao,fill=raintomorrow)) + geom_bar(position = "dodge",stat='identity') + theme_bw() + ggtitle("wind_dir3pm")

Variáveis Numéricas

Essas variáveis medem a velocidade do vento:

  • wind_gustspeed: a velocidade do vento mais forte daquele dia
  • wind_speed9am: a velocidade do vento as 9 da manhã
  • wind_speed3pm: a velocidade do vento as 3 da tarde
    Todas estão com um número aceitável de outliers e a mediana parece bem centralizada próxima da média
ggplot(wind_data,aes(x=1,wind_gustspeed)) + geom_boxplot() + theme_bw() + ggtitle("wind_gustspeed")

ggplot(wind_data,aes(x=1,wind_speed9am)) + geom_boxplot() + theme_bw() + ggtitle("wind_speed9am")

ggplot(wind_data,aes(x=1,wind_speed3pm)) + geom_boxplot() + theme_bw() + ggtitle("wind_speed3pm")

Relações com a variável resposta.

ggplot(wind_data,aes(wind_gustspeed, col = raintomorrow)) + geom_density() + theme_bw() + ggtitle("wind_gustspeed")
## Warning: Removed 10518 rows containing non-finite values (stat_density).

ggplot(wind_data,aes(wind_speed9am, col = raintomorrow)) + geom_density() + theme_bw() + ggtitle("wind_speed9am")
## Warning: Removed 1453 rows containing non-finite values (stat_density).

ggplot(wind_data,aes(wind_speed3pm, col = raintomorrow)) + geom_density() + theme_bw() + ggtitle("wind_speed3pm")
## Warning: Removed 3583 rows containing non-finite values (stat_density).

wind_data = wind_data %>% select(-raintomorrow)

wind_gustspeed é a que possui maior distinção entre as observações que chovem ou não no dia seguinte, mas nada muito conclusivo. Como foi dito anteriormente essa é uma avaliação da variável individualmente sem levar em conta interação com outras variáveis.

Feature Engineering e tratamento dos dados faltantes

Dados Chuvas

Como visto anteriormente existem muitos valores faltantes na base de dados de chuvas.

summary(rain_data)
##       date                location         mintemp         maxtemp         rainfall       evaporation        sunshine    
##  Min.   :2007-11-01   Canberra:  3418   Min.   :-8.50   Min.   :-4.80   Min.   :  0.00   Min.   :  0.00   Min.   : 0.00  
##  1st Qu.:2011-01-06   Sydney  :  3337   1st Qu.: 7.60   1st Qu.:17.90   1st Qu.:  0.00   1st Qu.:  2.60   1st Qu.: 4.90  
##  Median :2013-05-27   Perth   :  3193   Median :12.00   Median :22.60   Median :  0.00   Median :  4.80   Median : 8.50  
##  Mean   :2013-04-01   Darwin  :  3192   Mean   :12.19   Mean   :23.23   Mean   :  2.35   Mean   :  5.47   Mean   : 7.62  
##  3rd Qu.:2015-06-12   Hobart  :  3188   3rd Qu.:16.80   3rd Qu.:28.20   3rd Qu.:  0.80   3rd Qu.:  7.40   3rd Qu.:10.60  
##  Max.   :2017-06-25   Brisbane:  3161   Max.   :33.90   Max.   :48.10   Max.   :371.00   Max.   :145.00   Max.   :14.50  
##                       (Other) :122704   NA's   :637     NA's   :322     NA's   :1406     NA's   :60843    NA's   :67816  
##   humidity9am      humidity3pm      pressure9am      pressure3pm        cloud9am        cloud3pm        temp9am     
##  Min.   :  0.00   Min.   :  0.00   Min.   : 980.5   Min.   : 977.1   Min.   :0.00    Min.   :0.0     Min.   :-7.20  
##  1st Qu.: 57.00   1st Qu.: 37.00   1st Qu.:1012.9   1st Qu.:1010.4   1st Qu.:1.00    1st Qu.:2.0     1st Qu.:12.30  
##  Median : 70.00   Median : 52.00   Median :1017.6   Median :1015.2   Median :5.00    Median :5.0     Median :16.70  
##  Mean   : 68.84   Mean   : 51.48   Mean   :1017.7   Mean   :1015.3   Mean   :4.44    Mean   :4.5     Mean   :16.99  
##  3rd Qu.: 83.00   3rd Qu.: 66.00   3rd Qu.:1022.4   3rd Qu.:1020.0   3rd Qu.:7.00    3rd Qu.:7.0     3rd Qu.:21.60  
##  Max.   :100.00   Max.   :100.00   Max.   :1041.0   Max.   :1039.6   Max.   :9.00    Max.   :9.0     Max.   :40.20  
##  NA's   :1774     NA's   :3610     NA's   :14014    NA's   :13981    NA's   :53657   NA's   :57094   NA's   :904    
##     temp3pm      raintoday      amountOfRain     raintomorrow      temp          humidity      precipitation3pm precipitation9am
##  Min.   :-5.40   No  :109332   Min.   :  0.000   No :110316   Min.   :-3.76   Min.   :  2.00   Min.   : 0.00    Min.   :-17.74  
##  1st Qu.:16.60   Yes : 31455   1st Qu.:  0.000   Yes: 31877   1st Qu.:22.52   1st Qu.: 44.00   1st Qu.: 8.00    1st Qu.:  6.65  
##  Median :21.10   NA's:  1406   Median :  0.000                Median :28.52   Median : 63.20   Median :10.00    Median : 10.00  
##  Mean   :21.69                 Mean   :  2.361                Mean   :28.51   Mean   : 61.99   Mean   :10.01    Mean   : 10.00  
##  3rd Qu.:26.40                 3rd Qu.:  0.800                3rd Qu.:35.48   3rd Qu.: 80.00   3rd Qu.:12.00    3rd Qu.: 13.39  
##  Max.   :46.70                 Max.   :371.000                Max.   :59.72   Max.   :122.00   Max.   :26.00    Max.   : 32.48  
##  NA's   :2726                                                 NA's   :322     NA's   :3610                                      
##       Dia              Mes             Ano             Week       
##  1      :  4688   5      :13055   2016   :17508   19     :  2960  
##  15     :  4688   3      :13036   2014   :17400   22     :  2957  
##  16     :  4687   1      :12921   2015   :17231   20     :  2948  
##  2      :  4686   6      :12389   2009   :16595   23     :  2946  
##  13     :  4686   10     :11804   2010   :16419   10     :  2945  
##  6      :  4684   7      :11779   2013   :16097   25     :  2944  
##  (Other):114074   (Other):67209   (Other):40943   (Other):124493

Por início será utilizado a média mensal ao longo dos anos para dada localidade, entretanto para as variáveis citadas como com muitos outliers será utilizado a mediana.

medias_mes_local = rain_data %>% group_by(location,Mes) %>%
                   summarise_at(c(vars(mintemp:maxtemp,sunshine:temp3pm,temp,humidity),vars(rainfall,evaporation,amountOfRain)),funs(mean(.,na.rm = T), median(.,na.rm = T)))

rain_data_clean_1 = rain_data %>% left_join(medias_mes_local, by = c("location","Mes")) %>% 
        mutate(mintemp = coalesce(mintemp, mintemp_mean),
               maxtemp = coalesce(maxtemp, maxtemp_mean),
               rainfall = coalesce(rainfall, rainfall_median),
               evaporation = coalesce(evaporation, evaporation_median),
               sunshine = coalesce(sunshine, sunshine_mean),
               humidity9am = coalesce(humidity9am, humidity9am_mean),
               humidity3pm = coalesce(humidity3pm, humidity3pm_mean),
               pressure9am = coalesce(pressure9am, pressure9am_mean),
               pressure3pm = coalesce(pressure3pm, pressure3pm_mean),
               cloud9am = coalesce(cloud9am, cloud9am_mean),
               cloud3pm = coalesce(cloud3pm, cloud3pm_mean),
               temp9am = coalesce(temp9am, temp9am_mean),
               temp3pm = coalesce(temp3pm, temp3pm_mean),
               temp = coalesce(temp, temp_mean),
               humidity = coalesce(humidity, humidity_mean)) %>% select(-contains("mean"),-contains("median"))

Redução de quase 100 mil NA

sum(is.na(rain_data_clean_1))
## [1] 190589

Os campos que continuam com NA, não obtiveram nenhuma medida da variável dentro daquela cidade no período todo

summary(rain_data_clean_1)
##       date                location         mintemp         maxtemp         rainfall        evaporation        sunshine    
##  Min.   :2007-11-01   Canberra:  3418   Min.   :-8.50   Min.   :-4.80   Min.   :  0.000   Min.   :  0.00   Min.   : 0.00  
##  1st Qu.:2011-01-06   Sydney  :  3337   1st Qu.: 7.60   1st Qu.:17.90   1st Qu.:  0.000   1st Qu.:  2.60   1st Qu.: 5.40  
##  Median :2013-05-27   Perth   :  3193   Median :12.00   Median :22.60   Median :  0.000   Median :  4.60   Median : 8.20  
##  Mean   :2013-04-01   Darwin  :  3192   Mean   :12.19   Mean   :23.23   Mean   :  2.327   Mean   :  5.31   Mean   : 7.64  
##  3rd Qu.:2015-06-12   Hobart  :  3188   3rd Qu.:16.80   3rd Qu.:28.20   3rd Qu.:  0.600   3rd Qu.:  7.20   3rd Qu.:10.30  
##  Max.   :2017-06-25   Brisbane:  3161   Max.   :33.90   Max.   :48.10   Max.   :371.000   Max.   :145.00   Max.   :14.50  
##                       (Other) :122704                                                     NA's   :45482    NA's   :52071  
##   humidity9am      humidity3pm      pressure9am      pressure3pm        cloud9am        cloud3pm        temp9am     
##  Min.   :  0.00   Min.   :  0.00   Min.   : 980.5   Min.   : 977.1   Min.   :0.00    Min.   :0.00    Min.   :-7.20  
##  1st Qu.: 57.00   1st Qu.: 37.00   1st Qu.:1013.0   1st Qu.:1010.5   1st Qu.:2.00    1st Qu.:2.00    1st Qu.:12.20  
##  Median : 70.00   Median : 52.00   Median :1017.6   Median :1015.2   Median :5.00    Median :5.00    Median :16.70  
##  Mean   : 68.92   Mean   : 51.59   Mean   :1017.6   Mean   :1015.2   Mean   :4.57    Mean   :4.58    Mean   :16.96  
##  3rd Qu.: 83.00   3rd Qu.: 66.00   3rd Qu.:1022.3   3rd Qu.:1019.9   3rd Qu.:7.00    3rd Qu.:7.00    3rd Qu.:21.50  
##  Max.   :100.00   Max.   :100.00   Max.   :1041.0   Max.   :1039.6   Max.   :9.00    Max.   :9.00    Max.   :40.20  
##                                    NA's   :11781    NA's   :11781    NA's   :34034   NA's   :34034                  
##     temp3pm      raintoday      amountOfRain     raintomorrow      temp          humidity      precipitation3pm precipitation9am
##  Min.   :-5.40   No  :109332   Min.   :  0.000   No :110316   Min.   :-3.76   Min.   :  2.00   Min.   : 0.00    Min.   :-17.74  
##  1st Qu.:16.60   Yes : 31455   1st Qu.:  0.000   Yes: 31877   1st Qu.:22.52   1st Qu.: 44.00   1st Qu.: 8.00    1st Qu.:  6.65  
##  Median :21.10   NA's:  1406   Median :  0.000                Median :28.52   Median : 63.20   Median :10.00    Median : 10.00  
##  Mean   :21.72                 Mean   :  2.361                Mean   :28.51   Mean   : 62.11   Mean   :10.01    Mean   : 10.00  
##  3rd Qu.:26.50                 3rd Qu.:  0.800                3rd Qu.:35.48   3rd Qu.: 80.00   3rd Qu.:12.00    3rd Qu.: 13.39  
##  Max.   :46.70                 Max.   :371.000                Max.   :59.72   Max.   :122.00   Max.   :26.00    Max.   : 32.48  
##                                                                                                                                 
##       Dia              Mes             Ano             Week       
##  1      :  4688   5      :13055   2016   :17508   19     :  2960  
##  15     :  4688   3      :13036   2014   :17400   22     :  2957  
##  16     :  4687   1      :12921   2015   :17231   20     :  2948  
##  2      :  4686   6      :12389   2009   :16595   23     :  2946  
##  13     :  4686   10     :11804   2010   :16419   10     :  2945  
##  6      :  4684   7      :11779   2013   :16097   25     :  2944  
##  (Other):114074   (Other):67209   (Other):40943   (Other):124493

Para resolver isso será utilizado a média mensal de um grupo de localidades, construido a partir de uma análise de clusters. Como utilizarei os dados de ventos também para a construção desses clusters, irei tratá-los primeiro.

Dados Ventos

Tratativa dos dados de vento para realizar um join com a tabela principal com intuito de obter mais informações para tratar os NA restantes.

summary(wind_data)
##       date                location       wind_gustdir    wind_gustspeed    wind_dir9am     wind_dir3pm     wind_speed9am   
##  Min.   :2007-11-01   Canberra:  3877   W      : 11451   Min.   :  6.00   N      :13294   SE     : 12019   Min.   :  0.00  
##  1st Qu.:2011-06-07   Sydney  :  3798   SE     : 10654   1st Qu.: 31.00   SE     :10503   W      : 11523   1st Qu.:  7.00  
##  Median :2014-01-13   Perth   :  3655   N      : 10513   Median : 39.00   E      :10397   S      : 10971   Median : 13.00  
##  Mean   :2013-09-25   Darwin  :  3653   E      : 10446   Mean   : 39.86   SSE    :10289   WSW    : 10729   Mean   : 13.96  
##  3rd Qu.:2016-04-24   Hobart  :  3650   WSW    : 10320   3rd Qu.: 48.00   NW     :10039   SW     : 10630   3rd Qu.: 19.00  
##  Max.   :2017-06-25   Brisbane:  3618   (Other):100424   Max.   :135.00   (Other):98436   (Other):103679   Max.   :130.00  
##                       (Other) :142135   NA's   : 10578   NA's   :10518    NA's   :11428   NA's   :  4835   NA's   :1453    
##  wind_speed3pm  
##  Min.   : 0.00  
##  1st Qu.:13.00  
##  Median :19.00  
##  Mean   :18.57  
##  3rd Qu.:24.00  
##  Max.   :87.00  
##  NA's   :3583

Existem dias com mais de uma observação para a mesma cidade

dup = wind_data %>% group_by(location,date) %>% count()

table(dup$n)
## 
##      1      2 
## 120000  22193

As repetições são linhas duplicadas pois a quantidade de linhas duplicadas e do conjunto location/date que se repetem é a mesma

sum(duplicated(wind_data))
## [1] 22193

Removendo as linhas duplicadas

wind_data = wind_data[!duplicated(wind_data),]

Para os campos onde a velocidade do vento é zero, irei substituir a direção que possui NA por X

wind_data = wind_data %>% mutate_if(is.factor, as.character)

wind_data$wind_gustdir[wind_data$wind_gustspeed==0] = "X"
wind_data$wind_dir9am[wind_data$wind_speed9am==0] = "X"
wind_data$wind_dir3pm[wind_data$wind_speed3pm==0] = "X"

wind_data = wind_data %>% mutate_if(is.character, as.factor)
summary(wind_data)
##       date                location       wind_gustdir   wind_gustspeed    wind_dir9am     wind_dir3pm    wind_speed9am 
##  Min.   :2007-11-01   Canberra:  3418   W      : 9780   Min.   :  6.00   N      :11393   SE     :10663   Min.   :  0   
##  1st Qu.:2011-01-06   Sydney  :  3337   SE     : 9309   1st Qu.: 31.00   SE     : 9162   W      : 9911   1st Qu.:  7   
##  Median :2013-05-27   Perth   :  3193   E      : 9071   Median : 39.00   E      : 9024   S      : 9598   Median : 13   
##  Mean   :2013-04-01   Darwin  :  3192   N      : 9033   Mean   : 39.98   SSE    : 8966   WSW    : 9329   Mean   : 14   
##  3rd Qu.:2015-06-12   Hobart  :  3188   SSE    : 8993   3rd Qu.: 48.00   X      : 8612   SW     : 9182   3rd Qu.: 19   
##  Max.   :2017-06-25   Brisbane:  3161   (Other):86677   Max.   :135.00   (Other):93635   (Other):90828   Max.   :130   
##                       (Other) :122704   NA's   : 9330   NA's   :9270     NA's   : 1401   NA's   : 2682   NA's   :1348  
##  wind_speed3pm  
##  Min.   : 0.00  
##  1st Qu.:13.00  
##  Median :19.00  
##  Mean   :18.64  
##  3rd Qu.:24.00  
##  Max.   :87.00  
##  NA's   :2630
sum(is.na(wind_data))
## [1] 26661

Existem 2 cidades que não possuem nenhuma informação sobre wind_gustdir, logo não será possível utilizar a maior direção com maior frequência naquela região como substituição para esses casos.

wind_data$location = as.factor(wind_data$location)
wind_data$wind_gustdir = as.factor(wind_data$wind_gustdir)
wind_data$wind_dir9am = as.factor(wind_data$wind_dir9am)
wind_data$wind_dir3pm = as.factor(wind_data$wind_dir3pm)

winddir_location_mes = wind_data %>% group_by(location,Mes = month(date)) %>% summarise_at(vars(contains("dir")),funs(getmode))
summary(winddir_location_mes)
##           location        Mes         wind_gustdir  wind_dir9am   wind_dir3pm 
##  Adelaide     : 12   Min.   : 1.00   E      : 70   N      : 79   W      : 59  
##  Albany       : 12   1st Qu.: 3.75   W      : 68   X      : 69   WSW    : 52  
##  Albury       : 12   Median : 6.50   N      : 53   E      : 62   N      : 48  
##  AliceSprings : 12   Mean   : 6.50   SE     : 52   SE     : 45   SE     : 48  
##  BadgerysCreek: 12   3rd Qu.: 9.25   SW     : 40   SW     : 39   S      : 47  
##  Ballarat     : 12   Max.   :12.00   (Other):281   W      : 34   SW     : 42  
##  (Other)      :516                   NA's   : 24   (Other):260   (Other):292

Será utilizado a direção do vento mais frequente naquela região dentro do dado mês como substituição para os valores faltantes.

names(winddir_location_mes)[-c(1,2)] = paste0("Mode_",names(winddir_location_mes)[-c(1,2)])
wind_data = wind_data %>% mutate(Mes = month(wind_data$date))

wind_data_clean_1 = wind_data %>% left_join(winddir_location_mes, by = c("location","Mes")) %>% 
        mutate(wind_gustdir = coalesce(wind_gustdir, Mode_wind_gustdir),
               wind_dir9am = coalesce(wind_dir9am, Mode_wind_dir9am),
               wind_dir3pm = coalesce(wind_dir3pm, Mode_wind_dir3pm)) %>% select(-contains("Mode_"))

Para as velocidades utilizarei o valor mensal para cada localidade como substituição para os valores faltantes.

windspd_gust_location_mes = wind_data_clean_1 %>% group_by(location,Mes,wind_gustdir) %>% 
        summarise(Media_wind_gustspeed = mean(wind_gustspeed,na.rm = T))

wind_data_clean_2 = wind_data_clean_1 %>% left_join(windspd_gust_location_mes, by = c("location","Mes","wind_gustdir")) %>% 
                                        mutate(wind_gustspeed = coalesce(wind_gustspeed, Media_wind_gustspeed)) %>% 
                                        select(-Media_wind_gustspeed)


windspd_9am_location_mes = wind_data_clean_1 %>% group_by(location,Mes,wind_dir9am) %>% 
        summarise(Media_wind_speed9am = mean(wind_speed9am,na.rm = T))

wind_data_clean_3 = wind_data_clean_2 %>% left_join(windspd_9am_location_mes, by = c("location","Mes","wind_dir9am")) %>% 
                                        mutate(wind_speed9am = coalesce(wind_speed9am, Media_wind_speed9am)) %>% 
                                        select(-Media_wind_speed9am)

windspd_3pm_location_mes = wind_data_clean_1 %>% group_by(location,Mes,wind_dir3pm) %>% 
        summarise(Media_wind_speed3pm = mean(wind_speed3pm,na.rm = T))

wind_data_clean_4 = wind_data_clean_3 %>% left_join(windspd_3pm_location_mes, by = c("location","Mes","wind_dir3pm")) %>% 
                                        mutate(wind_speed3pm = coalesce(wind_speed3pm, Media_wind_speed3pm)) %>% 
                                        select(-Media_wind_speed3pm)

Todos os valores com exceção dos sobre o vento mais forte foram preenchidos dos dados sobre ventos.

summary(wind_data_clean_4)
##       date                location       wind_gustdir   wind_gustspeed    wind_dir9am     wind_dir3pm    wind_speed9am
##  Min.   :2007-11-01   Canberra:  3418   W      :10688   Min.   :  6.00   N      :11475   SE     :11365   Min.   :  0  
##  1st Qu.:2011-01-06   Sydney  :  3337   SE     : 9452   1st Qu.: 31.00   SE     : 9170   W      :10055   1st Qu.:  7  
##  Median :2013-05-27   Perth   :  3193   E      : 9349   Median : 39.00   E      : 9097   S      : 9677   Median : 13  
##  Mean   :2013-04-01   Darwin  :  3192   SSE    : 9349   Mean   : 40.05   SSE    : 9026   WSW    : 9559   Mean   : 14  
##  3rd Qu.:2015-06-12   Hobart  :  3188   N      : 9116   3rd Qu.: 48.00   X      : 8902   SW     : 9231   3rd Qu.: 19  
##  Max.   :2017-06-25   Brisbane:  3161   (Other):88268   Max.   :135.00   NW     : 8680   SSE    : 9156   Max.   :130  
##                       (Other) :122704   NA's   : 5971   NA's   :5971     (Other):85843   (Other):83150                
##  wind_speed3pm        Mes        
##  Min.   : 0.00   Min.   : 1.000  
##  1st Qu.:13.00   1st Qu.: 3.000  
##  Median :19.00   Median : 6.000  
##  Mean   :18.62   Mean   : 6.403  
##  3rd Qu.:24.00   3rd Qu.: 9.000  
##  Max.   :87.00   Max.   :12.000  
## 

Realizando join da tabela principal com a de vento e verificando que as colunas que ainda possuem NA são: evaporation, sunshine, pressure9am, pressure3pm, cloud9am, cloud3pm, wind_gustdir e wind_gustspeed

wind_data_clean_4 = wind_data_clean_4 %>% select(-Mes)

rain_wind_data = rain_data_clean_1 %>% left_join(wind_data_clean_4, by = c("location","date"))

summary(rain_wind_data)
##       date                location         mintemp         maxtemp         rainfall        evaporation        sunshine    
##  Min.   :2007-11-01   Canberra:  3418   Min.   :-8.50   Min.   :-4.80   Min.   :  0.000   Min.   :  0.00   Min.   : 0.00  
##  1st Qu.:2011-01-06   Sydney  :  3337   1st Qu.: 7.60   1st Qu.:17.90   1st Qu.:  0.000   1st Qu.:  2.60   1st Qu.: 5.40  
##  Median :2013-05-27   Perth   :  3193   Median :12.00   Median :22.60   Median :  0.000   Median :  4.60   Median : 8.20  
##  Mean   :2013-04-01   Darwin  :  3192   Mean   :12.19   Mean   :23.23   Mean   :  2.327   Mean   :  5.31   Mean   : 7.64  
##  3rd Qu.:2015-06-12   Hobart  :  3188   3rd Qu.:16.80   3rd Qu.:28.20   3rd Qu.:  0.600   3rd Qu.:  7.20   3rd Qu.:10.30  
##  Max.   :2017-06-25   Brisbane:  3161   Max.   :33.90   Max.   :48.10   Max.   :371.000   Max.   :145.00   Max.   :14.50  
##                       (Other) :122704                                                     NA's   :45482    NA's   :52071  
##   humidity9am      humidity3pm      pressure9am      pressure3pm        cloud9am        cloud3pm        temp9am     
##  Min.   :  0.00   Min.   :  0.00   Min.   : 980.5   Min.   : 977.1   Min.   :0.00    Min.   :0.00    Min.   :-7.20  
##  1st Qu.: 57.00   1st Qu.: 37.00   1st Qu.:1013.0   1st Qu.:1010.5   1st Qu.:2.00    1st Qu.:2.00    1st Qu.:12.20  
##  Median : 70.00   Median : 52.00   Median :1017.6   Median :1015.2   Median :5.00    Median :5.00    Median :16.70  
##  Mean   : 68.92   Mean   : 51.59   Mean   :1017.6   Mean   :1015.2   Mean   :4.57    Mean   :4.58    Mean   :16.96  
##  3rd Qu.: 83.00   3rd Qu.: 66.00   3rd Qu.:1022.3   3rd Qu.:1019.9   3rd Qu.:7.00    3rd Qu.:7.00    3rd Qu.:21.50  
##  Max.   :100.00   Max.   :100.00   Max.   :1041.0   Max.   :1039.6   Max.   :9.00    Max.   :9.00    Max.   :40.20  
##                                    NA's   :11781    NA's   :11781    NA's   :34034   NA's   :34034                  
##     temp3pm      raintoday      amountOfRain     raintomorrow      temp          humidity      precipitation3pm precipitation9am
##  Min.   :-5.40   No  :109332   Min.   :  0.000   No :110316   Min.   :-3.76   Min.   :  2.00   Min.   : 0.00    Min.   :-17.74  
##  1st Qu.:16.60   Yes : 31455   1st Qu.:  0.000   Yes: 31877   1st Qu.:22.52   1st Qu.: 44.00   1st Qu.: 8.00    1st Qu.:  6.65  
##  Median :21.10   NA's:  1406   Median :  0.000                Median :28.52   Median : 63.20   Median :10.00    Median : 10.00  
##  Mean   :21.72                 Mean   :  2.361                Mean   :28.51   Mean   : 62.11   Mean   :10.01    Mean   : 10.00  
##  3rd Qu.:26.50                 3rd Qu.:  0.800                3rd Qu.:35.48   3rd Qu.: 80.00   3rd Qu.:12.00    3rd Qu.: 13.39  
##  Max.   :46.70                 Max.   :371.000                Max.   :59.72   Max.   :122.00   Max.   :26.00    Max.   : 32.48  
##                                                                                                                                 
##       Dia              Mes             Ano             Week         wind_gustdir   wind_gustspeed    wind_dir9am   
##  1      :  4688   5      :13055   2016   :17508   19     :  2960   W      :10688   Min.   :  6.00   N      :11475  
##  15     :  4688   3      :13036   2014   :17400   22     :  2957   SE     : 9452   1st Qu.: 31.00   SE     : 9170  
##  16     :  4687   1      :12921   2015   :17231   20     :  2948   E      : 9349   Median : 39.00   E      : 9097  
##  2      :  4686   6      :12389   2009   :16595   23     :  2946   SSE    : 9349   Mean   : 40.05   SSE    : 9026  
##  13     :  4686   10     :11804   2010   :16419   10     :  2945   N      : 9116   3rd Qu.: 48.00   X      : 8902  
##  6      :  4684   7      :11779   2013   :16097   25     :  2944   (Other):88268   Max.   :135.00   NW     : 8680  
##  (Other):114074   (Other):67209   (Other):40943   (Other):124493   NA's   : 5971   NA's   :5971     (Other):85843  
##   wind_dir3pm    wind_speed9am wind_speed3pm  
##  SE     :11365   Min.   :  0   Min.   : 0.00  
##  W      :10055   1st Qu.:  7   1st Qu.:13.00  
##  S      : 9677   Median : 13   Median :19.00  
##  WSW    : 9559   Mean   : 14   Mean   :18.62  
##  SW     : 9231   3rd Qu.: 19   3rd Qu.:24.00  
##  SSE    : 9156   Max.   :130   Max.   :87.00  
##  (Other):83150

Clustering

Antes de utilizar a clusterização para preencher os dados faltantes, irei realizar uma clusterização utilizado somente os dados de coordenadas (externos) para a criação de uma noção de proximidade física das localidades. Utilizando o método de cotovelo, foi selecionado o número 4 de clusters.

data = coords[,-1]

elbow(data,20)

## NULL
mod_kmeans = kmeans(data, 4, nstart=50,iter.max = 15 )

rownames(data) <- paste0(coords$city)

fviz_cluster(mod_kmeans, data = data)

coords$Grupo_Prox = as.factor(mod_kmeans$cluster)

rain_wind_data_coords = rain_wind_data %>% left_join(coords, by=c("location"="city"))

pal <- colorFactor(c("blue","yellow","green","red","purple","black"), domain = levels(coords$Grupo_Prox))

leaflet(coords) %>% addProviderTiles(providers$OpenTopoMap) %>%  addCircleMarkers(lng = ~lng, lat = ~lat, color = ~pal(Grupo_Prox), stroke = FALSE,fillOpacity = .8, popup = ~city) %>% addLegend("topright", pal = pal, values = ~Grupo_Prox,
    title = "Grupo Proximidade",
    labFormat = labelFormat(prefix = ""),
    opacity = 1
  )

Agora será realizado uma clusterização com as colunas que não possuem NA, para encontrar regiões com comportamento parecido na esperança de utilizar a média mensal desses clusters para preencher os NA restantes, sob a hipótese de que esses clusters possam agregar da melhor forma possível regiões semelhantes. Esses clusters criados não serão utilizados para propósito de modelagem para evitar redundância.

cols_sem_na = c("location","mintemp", "maxtemp", "rainfall", "humidity9am", "humidity3pm", "temp9am", "temp3pm", 
                "amountOfRain", "temp", "humidity", "precipitation3pm", "precipitation9am", "wind_dir9am", "wind_dir3pm", 
                "wind_speed9am", "wind_speed3pm","lat","lng")

df_clust = rain_wind_data_coords[,cols_sem_na]

df_clust$wind_dir3pm = as.character(df_clust$wind_dir3pm)
df_clust$wind_dir9am = as.character(df_clust$wind_dir9am)

df_clust = df_clust %>% group_by(location) %>% 
                    summarise_all(funs(if(is.numeric(.)) mean(., na.rm = TRUE) else getmode(.)))


df_clust$wind_dir3pm = as.factor(df_clust$wind_dir3pm)
df_clust$wind_dir9am = as.factor(df_clust$wind_dir9am)

df_clust_prep = df_clust %>% mutate_if(is.numeric,scale)

data = model.matrix(location~.,df_clust_prep)[,-1]

elbow(data,20)

## NULL
#Escolhido 5 clusters

mod_kmeans = kmeans(data, 5, nstart=50,iter.max = 15 )

rownames(data) <- paste0(df_clust$location)

fviz_cluster(mod_kmeans, data = data)

df_clust$Cluster = as.factor(mod_kmeans$cluster)

pal <- colorFactor(c("blue","yellow","green","red","purple","black"), domain = levels(df_clust$Cluster))

leaflet(df_clust) %>% addProviderTiles(providers$OpenTopoMap) %>%  addCircleMarkers(lng = ~lng, lat = ~lat, color = ~pal(Cluster), stroke = FALSE,fillOpacity = .8, popup = ~location) %>% addLegend("topright", pal = pal, values = ~Cluster,
    title = "Cluster",
    labFormat = labelFormat(prefix = ""),
    opacity = 1
  )
rain_wind_data_cl = rain_wind_data_coords %>% left_join(df_clust %>% select(location,Cluster),by="location")

Agora o agrupamento será realizado a partir dos clusters encontrados ao invés da localização e a partir do mês. Criação de uma variável com a variação de temperatura do dia.

clust_loc = rain_wind_data_cl %>% group_by(Cluster,Mes) %>% 
                             summarise(Var_evaporation = mean(evaporation, na.rm = T),
                                       Var_sunshine = mean(sunshine, na.rm = T),
                                       Var_pressure9am = mean(pressure9am, na.rm = T),
                                       Var_pressure3pm = mean(pressure3pm, na.rm = T),
                                       Var_cloud9am = mean(cloud9am, na.rm = T),
                                       Var_cloud3pm = mean(cloud3pm, na.rm = T),
                                       Var_wind_gustdir = getmode(wind_gustdir),
                                       Var_wind_gustspeed = mean(wind_gustspeed, na.rm = T),
                                       Var_raintoday = getmode(raintoday))


rain_wind_data_final = rain_wind_data_cl %>% left_join(clust_loc, by = c("Cluster","Mes")) %>% 
                                        mutate(evaporation = coalesce(evaporation,Var_evaporation),
                                                  sunshine = coalesce(sunshine,Var_sunshine),
                                                  pressure9am = coalesce(pressure9am,Var_pressure9am),
                                                  pressure3pm = coalesce(pressure3pm,Var_pressure3pm),
                                                  cloud9am = coalesce(cloud9am,Var_cloud9am),
                                                  cloud3pm = coalesce(cloud3pm,Var_cloud3pm),
                                                  wind_gustdir =coalesce(wind_gustdir,Var_wind_gustdir),
                                                  wind_gustspeed =
                                                  coalesce(wind_gustspeed,Var_wind_gustspeed),
                                                  raintoday = coalesce(raintoday,Var_raintoday)) %>%
                                        select(-contains("Var_"),-Cluster)



rain_wind_data_final = rain_wind_data_final %>% mutate(Var_Temp = maxtemp - mintemp)

Preenchendo assim todos os dados faltantes que restavam.

summary(rain_wind_data_final)
##       date              location            mintemp         maxtemp         rainfall        evaporation         sunshine     
##  Min.   :2007-11-01   Length:142193      Min.   :-8.50   Min.   :-4.80   Min.   :  0.000   Min.   :  0.000   Min.   : 0.000  
##  1st Qu.:2011-01-06   Class :character   1st Qu.: 7.60   1st Qu.:17.90   1st Qu.:  0.000   1st Qu.:  2.630   1st Qu.: 5.800  
##  Median :2013-05-27   Mode  :character   Median :12.00   Median :22.60   Median :  0.000   Median :  4.620   Median : 7.782  
##  Mean   :2013-04-01                      Mean   :12.19   Mean   :23.23   Mean   :  2.327   Mean   :  5.198   Mean   : 7.528  
##  3rd Qu.:2015-06-12                      3rd Qu.:16.80   3rd Qu.:28.20   3rd Qu.:  0.600   3rd Qu.:  7.000   3rd Qu.: 9.500  
##  Max.   :2017-06-25                      Max.   :33.90   Max.   :48.10   Max.   :371.000   Max.   :145.000   Max.   :14.500  
##                                                                                                                              
##   humidity9am      humidity3pm      pressure9am      pressure3pm        cloud9am        cloud3pm        temp9am     
##  Min.   :  0.00   Min.   :  0.00   Min.   : 980.5   Min.   : 977.1   Min.   :0.000   Min.   :0.000   Min.   :-7.20  
##  1st Qu.: 57.00   1st Qu.: 37.00   1st Qu.:1013.3   1st Qu.:1010.8   1st Qu.:3.000   1st Qu.:3.163   1st Qu.:12.20  
##  Median : 70.00   Median : 52.00   Median :1017.7   Median :1015.4   Median :5.000   Median :4.951   Median :16.70  
##  Mean   : 68.92   Mean   : 51.59   Mean   :1017.7   Mean   :1015.3   Mean   :4.658   Mean   :4.649   Mean   :16.96  
##  3rd Qu.: 83.00   3rd Qu.: 66.00   3rd Qu.:1022.1   3rd Qu.:1019.7   3rd Qu.:6.547   3rd Qu.:6.000   3rd Qu.:21.50  
##  Max.   :100.00   Max.   :100.00   Max.   :1041.0   Max.   :1039.6   Max.   :9.000   Max.   :9.000   Max.   :40.20  
##                                                                                                                     
##     temp3pm      raintoday     amountOfRain     raintomorrow      temp          humidity      precipitation3pm precipitation9am
##  Min.   :-5.40   No :110734   Min.   :  0.000   No :110316   Min.   :-3.76   Min.   :  2.00   Min.   : 0.00    Min.   :-17.74  
##  1st Qu.:16.60   Yes: 31459   1st Qu.:  0.000   Yes: 31877   1st Qu.:22.52   1st Qu.: 44.00   1st Qu.: 8.00    1st Qu.:  6.65  
##  Median :21.10                Median :  0.000                Median :28.52   Median : 63.20   Median :10.00    Median : 10.00  
##  Mean   :21.72                Mean   :  2.361                Mean   :28.51   Mean   : 62.11   Mean   :10.01    Mean   : 10.00  
##  3rd Qu.:26.50                3rd Qu.:  0.800                3rd Qu.:35.48   3rd Qu.: 80.00   3rd Qu.:12.00    3rd Qu.: 13.39  
##  Max.   :46.70                Max.   :371.000                Max.   :59.72   Max.   :122.00   Max.   :26.00    Max.   : 32.48  
##                                                                                                                                
##       Dia              Mes             Ano             Week         wind_gustdir   wind_gustspeed    wind_dir9am   
##  1      :  4688   5      :13055   2016   :17508   19     :  2960   W      :13153   Min.   :  6.00   N      :11475  
##  15     :  4688   3      :13036   2014   :17400   22     :  2957   N      :10387   1st Qu.: 31.00   SE     : 9170  
##  16     :  4687   1      :12921   2015   :17231   20     :  2948   SSE    :10314   Median : 39.00   E      : 9097  
##  2      :  4686   6      :12389   2009   :16595   23     :  2946   SE     : 9944   Mean   : 40.02   SSE    : 9026  
##  13     :  4686   10     :11804   2010   :16419   10     :  2945   E      : 9349   3rd Qu.: 46.53   X      : 8902  
##  6      :  4684   7      :11779   2013   :16097   25     :  2944   ENE    : 9051   Max.   :135.00   NW     : 8680  
##  (Other):114074   (Other):67209   (Other):40943   (Other):124493   (Other):79995                    (Other):85843  
##   wind_dir3pm    wind_speed9am wind_speed3pm        lat              lng        Grupo_Prox    Var_Temp     
##  SE     :11365   Min.   :  0   Min.   : 0.00   Min.   :-42.85   Min.   :115.1   1:20706    Min.   :-1.895  
##  W      :10055   1st Qu.:  7   1st Qu.:13.00   1st Qu.:-36.06   1st Qu.:138.6   2:55323    1st Qu.: 7.200  
##  S      : 9677   Median : 13   Median :19.00   Median :-33.94   Median :145.8   3:15324    Median :10.500  
##  WSW    : 9559   Mean   : 14   Mean   :18.62   Mean   :-32.75   Mean   :142.1   4:50840    Mean   :11.039  
##  SW     : 9231   3rd Qu.: 19   3rd Qu.:24.00   3rd Qu.:-31.47   3rd Qu.:150.7              3rd Qu.:14.500  
##  SSE    : 9156   Max.   :130   Max.   :87.00   Max.   :-12.43   Max.   :168.0              Max.   :31.200  
##  (Other):83150

Modelagem

Divisão dos dados em treino (80%), teste (10%) e validação(10%)

rain_wind_data_final$raintoday = ifelse(rain_wind_data_final$raintoday == "Yes",1,0)

set.seed(1234)
#part = createDataPartition(rain_wind_data_final$raintomorrow,p=.8,list = F)

rain_wind_data_final$raintomorrow = as.factor(ifelse(rain_wind_data_final$raintomorrow=="Yes",1,0))

rain_wind_data_final$location = as.factor(rain_wind_data_final$location)

train = rain_wind_data_final[part,]
test = rain_wind_data_final[-part,]

set.seed(1234)
#part_val = createDataPartition(test$raintomorrow,p=.5,list = F)

validation = test[part_val,]
test = test[-part_val,]

trainctrl <- trainControl(verboseIter = TRUE,method="cv", number=3,allowParallel = TRUE)

Criação da variável Choveu_Grupo, que verifica se em alguma das localidades do grupo de proximidade houve relato de chuva naquele dia. Criação dessa variável nos dados de treino, para evitar overfitting.

chuva_prox = train %>% group_by(Grupo_Prox,date) %>% 
                                  summarise(Choveu_Grupo = as.factor(max(raintoday,na.rm = T)))

train$raintoday = as.factor(train$raintoday)

train = train %>% left_join(chuva_prox)
## Joining, by = c("date", "Grupo_Prox")
train$Choveu_Grupo = as.factor(train$Choveu_Grupo)



prop.table(table(Chove_Amanha = train$raintomorrow,Choveu_Proximo = train$Choveu_Grupo),margin=1)
##             Choveu_Proximo
## Chove_Amanha         0         1
##            0 0.4177422 0.5822578
##            1 0.1720257 0.8279743
prep = preProcess(train,c('center','scale'))

train = predict(prep,train)

test = test %>% left_join(chuva_prox)
## Joining, by = c("date", "Grupo_Prox")
test$Choveu_Grupo[is.na(test$Choveu_Grupo)] = 0
test$Choveu_Grupo = as.factor(test$Choveu_Grupo)
test$raintoday = as.factor(test$raintoday)

test = predict(prep,test)

Podemos notar que aproximadamente 80% das vezes que chove no dia seguinte, houve alguma chuva dentro do grupo de proximidade.

Como a variável resposta é muito desbalanceada, utilizarei um método de up sampling para equilibrá-la.

sim = train[train$raintomorrow==1,]
nao = train[!train$raintomorrow==1,]

set.seed(1234)
#samp_sim = sample(nrow(sim),50000,replace = T)
set.seed(1234)
#samp_nao = sample(nrow(nao),50000,replace = F)

train_bal = rbind(sim[samp_sim,],nao[samp_nao,])

Seleção das variáveis que serão utilizadas na composição do modelo inicial.

vars_modelo = c("location", "mintemp", "maxtemp", "rainfall", "evaporation", "sunshine",        
"humidity9am", "humidity3pm", "pressure9am", "pressure3pm", "cloud9am", "cloud3pm", "temp9am",         
"temp3pm", "raintoday", "temp", "humidity", "precipitation3pm", "precipitation9am", "Mes", "wind_gustdir", "wind_gustspeed", "wind_dir9am", "wind_dir3pm", "wind_speed9am", "wind_speed3pm", "Grupo_Prox", "Var_Temp", "Choveu_Grupo", "raintomorrow")

train_bal = train_bal[,vars_modelo]

train = train[,vars_modelo]

Arvore de Decisão

#model_rpart = train(raintomorrow~.,train_bal,"rpart",tuneLength = 3,trControl = trainctrl)

pred_rpart = predict(model_rpart,test)

confusionMatrix(pred_rpart,test$raintomorrow)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 8190  821
##          1 2841 2366
##                                           
##                Accuracy : 0.7424          
##                  95% CI : (0.7352, 0.7496)
##     No Information Rate : 0.7758          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.3957          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.7425          
##             Specificity : 0.7424          
##          Pos Pred Value : 0.9089          
##          Neg Pred Value : 0.4544          
##              Prevalence : 0.7758          
##          Detection Rate : 0.5760          
##    Detection Prevalence : 0.6338          
##       Balanced Accuracy : 0.7424          
##                                           
##        'Positive' Class : 0               
## 
aux_rpart = rocp(model_rpart,test,test$raintomorrow)

## NULL
acc_rpart = confusionMatrix(pred_rpart,test$raintomorrow)$overall[1]

plot_density_pred(model_rpart,test)

teste = predict(model_rpart,test,"prob")[,2]
teste2 = as.factor(ifelse(teste>.3,1,0))

confusionMatrix(teste2,test$raintomorrow)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 6162  527
##          1 4869 2660
##                                           
##                Accuracy : 0.6205          
##                  95% CI : (0.6124, 0.6285)
##     No Information Rate : 0.7758          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.2649          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.5586          
##             Specificity : 0.8346          
##          Pos Pred Value : 0.9212          
##          Neg Pred Value : 0.3533          
##              Prevalence : 0.7758          
##          Detection Rate : 0.4334          
##    Detection Prevalence : 0.4705          
##       Balanced Accuracy : 0.6966          
##                                           
##        'Positive' Class : 0               
## 

Regressão Logística

#model_logistic = train(raintomorrow~.,train_bal,"glm",family="binomial")

pred_logistic = predict(model_logistic,test)

confusionMatrix(pred_logistic,test$raintomorrow)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 8834  702
##          1 2197 2485
##                                           
##                Accuracy : 0.7961          
##                  95% CI : (0.7894, 0.8027)
##     No Information Rate : 0.7758          
##     P-Value [Acc > NIR] : 2.539e-09       
##                                           
##                   Kappa : 0.4976          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.8008          
##             Specificity : 0.7797          
##          Pos Pred Value : 0.9264          
##          Neg Pred Value : 0.5308          
##              Prevalence : 0.7758          
##          Detection Rate : 0.6213          
##    Detection Prevalence : 0.6707          
##       Balanced Accuracy : 0.7903          
##                                           
##        'Positive' Class : 0               
## 
aux_logistic = rocp(model_logistic,test,test$raintomorrow)

## NULL
acc_logistic = confusionMatrix(pred_logistic,test$raintomorrow)$overall[1]

plot_density_pred(model_logistic,test)

XGBoost

#model_xgb = train(raintomorrow~.,train_bal,"xgbTree",tuneLength = 3,trControl = trainctrl)

pred_xgb = predict(model_xgb,test)

confusionMatrix(pred_xgb,test$raintomorrow)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 9153  753
##          1 1878 2434
##                                           
##                Accuracy : 0.815           
##                  95% CI : (0.8085, 0.8213)
##     No Information Rate : 0.7758          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.5273          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.8298          
##             Specificity : 0.7637          
##          Pos Pred Value : 0.9240          
##          Neg Pred Value : 0.5645          
##              Prevalence : 0.7758          
##          Detection Rate : 0.6438          
##    Detection Prevalence : 0.6967          
##       Balanced Accuracy : 0.7967          
##                                           
##        'Positive' Class : 0               
## 
aux_xgb = rocp(model_xgb,test,test$raintomorrow)

## NULL
acc_xgb = confusionMatrix(pred_xgb,test$raintomorrow)$overall[1]

plot_density_pred(model_xgb,test)

Conditional Tree

#model_ctree = train(raintomorrow~.,train_bal,"ctree",tuneLength = 3,trControl = trainctrl)

pred_ctree = predict(model_ctree,test)

confusionMatrix(pred_ctree,test$raintomorrow)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 8526  880
##          1 2505 2307
##                                           
##                Accuracy : 0.7619          
##                  95% CI : (0.7548, 0.7689)
##     No Information Rate : 0.7758          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.4206          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.7729          
##             Specificity : 0.7239          
##          Pos Pred Value : 0.9064          
##          Neg Pred Value : 0.4794          
##              Prevalence : 0.7758          
##          Detection Rate : 0.5997          
##    Detection Prevalence : 0.6616          
##       Balanced Accuracy : 0.7484          
##                                           
##        'Positive' Class : 0               
## 
auc_ctree = rocp(model_ctree,test,test$raintomorrow)

## NULL
acc_ctree = confusionMatrix(pred_ctree,test$raintomorrow)$overall[1]

plot_density_pred(model_ctree,test)

XGBoost Amostra Desbalanceada

O XGBoost possui um hiperparâmetro específico para lidar com amostras desbalanceadas chamado scale_pos_weight, o valor recomendado para elé número de amostras negativas/número de amostras positivas.

#mod_xgb_weighted = train(raintomorrow~. ,train,"xgbTree", tuneLength = 3,trControl = trainctrl, scale_pos_weight = 3.46)

pred_xgb_weighted = predict(mod_xgb_weighted,test)

confusionMatrix(pred_xgb_weighted,test$raintomorrow)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 10898  2179
##          1   133  1008
##                                           
##                Accuracy : 0.8374          
##                  95% CI : (0.8312, 0.8434)
##     No Information Rate : 0.7758          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.3942          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9879          
##             Specificity : 0.3163          
##          Pos Pred Value : 0.8334          
##          Neg Pred Value : 0.8834          
##              Prevalence : 0.7758          
##          Detection Rate : 0.7665          
##    Detection Prevalence : 0.9197          
##       Balanced Accuracy : 0.6521          
##                                           
##        'Positive' Class : 0               
## 
auc_xgb_weighted=rocp(mod_xgb_weighted,test,test$raintomorrow)

## NULL
acc_xgb_weighted = confusionMatrix(pred_xgb_weighted,test$raintomorrow)$overall[1]

plot_density_pred(mod_xgb_weighted,test)

Random Forest

#mod_rf = train(raintomorrow~. ,train_bal,"rf", tuneLength = 3,trControl = trainctrl)

pred_rf = predict(mod_rf,test)

confusionMatrix(pred_rf,test$raintomorrow)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 9841 1050
##          1 1190 2137
##                                           
##                Accuracy : 0.8425          
##                  95% CI : (0.8364, 0.8484)
##     No Information Rate : 0.7758          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.554           
##                                           
##  Mcnemar's Test P-Value : 0.003315        
##                                           
##             Sensitivity : 0.8921          
##             Specificity : 0.6705          
##          Pos Pred Value : 0.9036          
##          Neg Pred Value : 0.6423          
##              Prevalence : 0.7758          
##          Detection Rate : 0.6922          
##    Detection Prevalence : 0.7660          
##       Balanced Accuracy : 0.7813          
##                                           
##        'Positive' Class : 0               
## 
auc_rf = rocp(mod_rf,test,test$raintomorrow)

## NULL
acc_rf = confusionMatrix(pred_rf,test$raintomorrow)$overall[1]

plot_density_pred(mod_rf,test)

Tuning XGBoost

Na tentativa de melhorar a performance do modelo do XGBoost, será realizado um tuning dos seus hiperparâmetros.

nrounds <- 1000
tune_grid <- expand.grid(
  nrounds = seq(from = 200, to = nrounds, by = 50),
  eta = c(0.025, 0.05, 0.1, 0.3),
  max_depth = c(2, 3, 4, 5, 6),
  gamma = 0,
  colsample_bytree = 1,
  min_child_weight = 1,
  subsample = 1
)

trainctrl <- trainControl(verboseIter = TRUE,method="cv", number=3)
#tune_xgb1 = train(raintomorrow~.,train_bal,"xgbTree",trControl = trainctrl,tuneGrid = tune_grid)


ggplot(tune_xgb1) + theme_bw() + ylim(c(min(tune_xgb1$results$Accuracy),1))

maxd = tune_xgb1$bestTune$max_depth

tune_grid2 <- expand.grid(
  nrounds = seq(from = 50, to = nrounds, by = 50),
  eta = tune_xgb1$bestTune$eta,
  max_depth = c((maxd-1):(maxd+1)),
  gamma = 0,
  colsample_bytree = 1,
  min_child_weight = c(1, 2, 3),
  subsample = 1
)

#tune_xgb2 = train(raintomorrow~.,train_bal,"xgbTree",trControl = trainctrl,tuneGrid = tune_grid2)

ggplot(tune_xgb2) + theme_bw() + ylim(c(min(tune_xgb2$results$Accuracy),1))

tune_grid3 <- expand.grid(
  nrounds = seq(from = 50, to = nrounds, by = 50),
  eta = tune_xgb1$bestTune$eta,
  max_depth = 6,
  gamma = 0,
  colsample_bytree = c(0.4, 0.6, 0.8, 1.0),
  min_child_weight = tune_xgb2$bestTune$min_child_weight,
  subsample = c(0.5, 0.75, 1.0)
)

#tune_xgb3 = train(raintomorrow~.,train_bal,"xgbTree",trControl = trainctrl,tuneGrid = tune_grid3)

ggplot(tune_xgb3) + theme_bw() + ylim(c(min(tune_xgb3$results$Accuracy),1))

tune_grid4 <- expand.grid(
  nrounds = seq(from = 50, to = nrounds, by = 50),
  eta = tune_xgb1$bestTune$eta,
  max_depth = 6,
  gamma = c(0, 0.05, 0.1, 0.5, 0.7, 0.9, 1.0),
  colsample_bytree = tune_xgb3$bestTune$colsample_bytree,
  min_child_weight = tune_xgb2$bestTune$min_child_weight,
  subsample = tune_xgb3$bestTune$subsample
)
#tune_xgb4 = train(raintomorrow~.,train_bal,"xgbTree",trControl = trainctrl,tuneGrid = tune_grid4)

ggplot(tune_xgb4) + theme_bw() + ylim(c(min(tune_xgb4$results$Accuracy),1))
## Warning: The shape palette can deal with a maximum of 6 discrete values because more than 6 becomes difficult to
## discriminate; you have 7. Consider specifying shapes manually if you must have them.
## Warning: Removed 20 rows containing missing values (geom_point).

tune_grid5 <- expand.grid(
  nrounds = seq(from = 100, to = 10000, by = 100),
  eta = c(0.01, 0.015, 0.025, 0.05, 0.1),
  max_depth = 6,
  gamma = tune_xgb4$bestTune$gamma,
  colsample_bytree = tune_xgb3$bestTune$colsample_bytree,
  min_child_weight = tune_xgb2$bestTune$min_child_weight,
  subsample = tune_xgb3$bestTune$subsample
)

#tune_xgb5 = train(raintomorrow~.,train_bal,"xgbTree",trControl = trainctrl,tuneGrid = tune_grid5)

ggplot(tune_xgb5) + theme_bw() + ylim(c(min(tune_xgb5$results$Accuracy),1))

final_grid <- expand.grid(
  nrounds = 2800,
  eta = 0.1,
  max_depth = 6,
  gamma = tune_xgb5$bestTune$gamma,
  colsample_bytree = tune_xgb5$bestTune$colsample_bytree,
  min_child_weight = tune_xgb5$bestTune$min_child_weight,
  subsample = tune_xgb5$bestTune$subsample)

#mod_xgb_tuned = train(raintomorrow~.,train_bal,"xgbTree",trControl = trainctrl,tuneGrid = final_grid)

pred_xgb_tuned = predict(mod_xgb_tuned,test)

confusionMatrix(pred_xgb_tuned,test$raintomorrow)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 9700  963
##          1 1331 2224
##                                           
##                Accuracy : 0.8387          
##                  95% CI : (0.8325, 0.8447)
##     No Information Rate : 0.7758          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.5544          
##                                           
##  Mcnemar's Test P-Value : 1.824e-14       
##                                           
##             Sensitivity : 0.8793          
##             Specificity : 0.6978          
##          Pos Pred Value : 0.9097          
##          Neg Pred Value : 0.6256          
##              Prevalence : 0.7758          
##          Detection Rate : 0.6822          
##    Detection Prevalence : 0.7500          
##       Balanced Accuracy : 0.7886          
##                                           
##        'Positive' Class : 0               
## 
auc_xgb_tuned = rocp(mod_xgb_tuned,test,test$raintomorrow)

## NULL
acc_xgb_tuned = confusionMatrix(pred_xgb_tuned,test$raintomorrow)$overall[1]

plot_density_pred(mod_xgb_tuned,test)

Comparativo

accs = c(acc_rpart,acc_logistic,acc_xgb,acc_ctree,acc_xgb_weighted,acc_rf, acc_xgb_tuned)
aucs = c(aux_rpart[[1]],aux_logistic[[1]],aux_xgb[[1]],auc_ctree[[1]],auc_xgb_weighted[[1]],auc_rf[[1]],auc_xgb_tuned[[1]])


df = data.frame(Modelo = c("Decision Tree",
                           "Logistic Regression",
                           "XGBoost","Conditional Tree",
                           "XGBoost Unbalanced", "Random Forest", "XGBoost Tuned"), Accuracy=accs,AUC=aucs)


df
##                Modelo  Accuracy       AUC
## 1       Decision Tree 0.7424392 0.7660589
## 2 Logistic Regression 0.7961035 0.8722469
## 3             XGBoost 0.8149529 0.8820449
## 4    Conditional Tree 0.7619215 0.8173365
## 5  XGBoost Unbalanced 0.8373892 0.8827417
## 6       Random Forest 0.8424532 0.8846150
## 7       XGBoost Tuned 0.8386552 0.8847764
preds_list = list(predict(model_rpart,test,"prob")[,2],
          predict(model_logistic,test,"prob")[,2],
          predict(model_xgb,test,"prob")[,2],
          predict(model_ctree,test,"prob")[,2],
          predict(mod_xgb_weighted,test,"prob")[,2],
          predict(mod_rf,test,"prob")[,2],
          predict(mod_xgb_tuned,test,"prob")[,2])
m <- length(preds_list)
actuals_list <- rep(list(test$raintomorrow), m)

pred <- prediction(preds_list, actuals_list)
rocs <- performance(pred, "tpr", "fpr")
plot(rocs, col = as.list(1:m), main = "Test Set ROC Curves")
legend(x = "bottomright", 
       legend = c("Decision Tree",
                           "Logistic Regression",
                           "XGBoost","Conditional Tree",
                           "XGBoost Unbalanced", "Random Forest","XGBoost Tuned"),
       fill = 1:m)

O modelo escohido foi XGBoost Tuned, o método de seleção foi o AUC. O ponto de corte foi alterado para 0.35, atingindo assim as seguintes métricas para a base de validação.

validation = validation %>% left_join(chuva_prox)
## Joining, by = c("date", "Grupo_Prox")
validation$Choveu_Grupo[is.na(validation$Choveu_Grupo)] = 0
validation$Choveu_Grupo = as.factor(validation$Choveu_Grupo)
validation$raintoday = as.factor(validation$raintoday)

validation = predict(prep,validation)

pred_thresh = predict(mod_xgb_tuned,validation,"prob")[,2]
pred_thresh = as.factor(ifelse(pred_thresh>.35,1,0))

confusionMatrix(pred_thresh,validation$raintomorrow)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 9183  722
##          1 1849 2466
##                                           
##                Accuracy : 0.8192          
##                  95% CI : (0.8128, 0.8255)
##     No Information Rate : 0.7758          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.5383          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.8324          
##             Specificity : 0.7735          
##          Pos Pred Value : 0.9271          
##          Neg Pred Value : 0.5715          
##              Prevalence : 0.7758          
##          Detection Rate : 0.6458          
##    Detection Prevalence : 0.6966          
##       Balanced Accuracy : 0.8030          
##                                           
##        'Positive' Class : 0               
## 

Conclusão

O modelo escolhido possui um resultado satisfatório, entretanto necessita melhora para o caso dos falsos positivos, talvez a tentativa de algum outro modelo ou a criação de algumas outras variáveis possa criar uma melhora na performance.